This Rmarkdown file assesses the output of CheckV, DeepVirFinder, Kaiju, VIBRANT, VirSorter, and VirSorter2 on multiple training sets of microbial DNA, primarily from NCBI. Created from fungal, viral, bacterial, archeael, protist, and plasmid DNA sequences
Please reach out to James Riddell (riddell.26@buckeyemail.osu.edu) or Bridget Hegarty (beh53@case.edu) regarding any issues, or open an issue on github.
library(pROC)
Type 'citation("pROC")' for a citation.
Attaching package: ‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
Warning messages:
1: Unknown or uninitialised column: `confusion_matrix_high_precision`.
2: Unknown or uninitialised column: `confusion_matrix_high_precision`.
3: Unknown or uninitialised column: `confusion_matrix_high_precision`.
4: Unknown or uninitialised column: `confusion_matrix_high_recall`.
5: Unknown or uninitialised column: `confusion_matrix_high_recall`.
6: Unknown or uninitialised column: `confusion_matrix_high_recall`.
7: Unknown or uninitialised column: `keep_score_visualize`.
8: Unknown or uninitialised column: `truepositive`.
Import the file that combines the results from each of the tools from running “combining_tool_output.Rmd”:
viruses <- read_tsv("../IntermediaryFiles/viral_tools_combined.tsv")
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
seqtype = col_character(),
contig = col_character(),
checkv_provirus = col_character(),
checkv_quality = col_character(),
method.x = col_character(),
Classified = col_character(),
IDs_all = col_character(),
Seq = col_character(),
Kaiju_Viral = col_character(),
Kingdom = col_character(),
type = col_character(),
vibrant_quality = col_character(),
method.y = col_character(),
vibrant_prophage = col_character(),
vs2type = col_character(),
max_score_group = col_character()
)
ℹ Use `spec()` for the full column specifications.
This section defines a viralness score “keep_score” based on the tool classifications. A final keep_score above 1 indicates we will keep that sequence and call it viral.
VIBRANT Quality == “High Quality Draft”: +1 Quality == “Medium Quality Draft”: +1 Quality == “Low Quality Draft” & provirus == TRUE: +0.5
Virsorter2 Viral >= 50: +0.5 Viral >= 0.95: +0.5
Virsorter category == 1,2,4,5: +1 category == 3,6: +0.5
DeepVirFinder: Score >= 0.7: +0.5 Score >= 0.9: +0.5
Kaiju: Kaiju_viral = “cellular organisms”: -1 Kaiju_viral = “Viruses”: +1
CheckV If %unknown >= 75: +0.5 Hallmark > 2: +1 viral_genes == 0 and host_genes >= 1: keep_score = 0 If 3*viral_genes <= host_genes: keep_score = 0 If length > 50,000 and hallmark == 0: keep_score = 0
This script produces visualizations of these combined viral scorings and includes ecological metrics like alpha diversity.
You can decide which combination is appropriate for them and only need use the tools appropriate for your data.
getting_viral_set_1 <- function(input_seqs,
There were 24 warnings (use warnings() to see them)
include_vibrant=FALSE,
include_virsorter2=FALSE,
include_deepvirfinder=FALSE,
include_tuning=FALSE,
include_kaiju=FALSE,
include_virsorter=FALSE) {
keep_score <- rep(0, nrow(input_seqs))
if (include_vibrant) {
keep_score[input_seqs$vibrant_quality=="high quality draft"] <- keep_score[input_seqs$vibrant_quality=="high quality draft"] + 1
keep_score[input_seqs$vibrant_quality=="medium quality draft"] <- keep_score[input_seqs$vibrant_quality=="medium quality draft"] + 1
keep_score[input_seqs$vibrant_quality=="low quality draft" & input_seqs$checkv_provirus=="Yes"] <- keep_score[input_seqs$vibrant_quality=="low quality draft" & input_seqs$checkv_provirus=="Yes"] + 0.5
# keep_score[input_seqs$vibrant_quality=="low quality draft"] <- keep_score[input_seqs$vibrant_quality=="low quality draft"] + 0.5
}
if (include_virsorter2) {
keep_score[input_seqs$viral>=50] <- keep_score[input_seqs$viral>=50] + 0.5
keep_score[input_seqs$viral>=95] <- keep_score[input_seqs$viral>=95] + 0.5
}
if (include_virsorter) {
keep_score[input_seqs$category==1] <- keep_score[input_seqs$category==1] + 1
keep_score[input_seqs$category==2] <- keep_score[input_seqs$category==2] + 1
keep_score[input_seqs$category==3] <- keep_score[input_seqs$category==3] + 0.5
keep_score[input_seqs$category==4] <- keep_score[input_seqs$category==4] + 1
keep_score[input_seqs$category==5] <- keep_score[input_seqs$category==5] + 1
keep_score[input_seqs$category==6] <- keep_score[input_seqs$category==6] + 0.5
}
if (include_deepvirfinder) {
keep_score[input_seqs$score>=0.7 & input_seqs$checkv_length<20000] <- keep_score[input_seqs$score>=0.7 & input_seqs$checkv_length<20000] + 0.5
keep_score[input_seqs$score>=0.9 & input_seqs$checkv_length<20000] <- keep_score[input_seqs$score>=0.9 & input_seqs$checkv_length<20000] + 0.5
}
if (include_kaiju) {
keep_score[input_seqs$Kaiju_Viral=="cellular organisms"] <- keep_score[input_seqs$Kaiju_Viral=="cellular organisms"] - 1
keep_score[input_seqs$Kaiju_Viral=="Viruses"] <- keep_score[input_seqs$Kaiju_Viral=="Viruses"] + 0.5
}
if (include_tuning) {
keep_score[input_seqs$hallmark>2] <- keep_score[input_seqs$hallmark>2] + 1
keep_score[input_seqs$checkv_host_genes>50 & input_seqs$checkv_provirus=="No"] <- keep_score[input_seqs$checkv_host_genes>50 & input_seqs$checkv_provirus=="No"] - 1
keep_score[input_seqs$percent_unknown>=75 & input_seqs$checkv_length<50000] <- keep_score[input_seqs$percent_unknown>=75 & input_seqs$checkv_length<50000] + 0.5
keep_score[input_seqs$percent_viral>=50] <- keep_score[input_seqs$percent_viral>=50] + 0.5
#keep_score[input_seqs$hallmark>=(input_seqs$checkv_viral_genes/5)] <- keep_score[input_seqs$hallmark>=(input_seqs$checkv_viral_genes/5)] + 1 #add some ratio
keep_score[input_seqs$checkv_viral_genes==0 & input_seqs$checkv_host_genes>=1] <- keep_score[input_seqs$checkv_viral_genes==0 & input_seqs$checkv_host_genes>=1] - 1
keep_score[((input_seqs$checkv_viral_genes*3) <= input_seqs$checkv_host_genes) & input_seqs$checkv_provirus=="No"] <- keep_score[((input_seqs$checkv_viral_genes*3) <= input_seqs$checkv_host_genes) & input_seqs$checkv_provirus=="No"] - 1 # consider accounting for provirus designation
# keep_score[(input_seqs$checkv_viral_genes*3) <= input_seqs$checkv_host_genes] <- 0 # consider accounting for provirus designation
keep_score[input_seqs$checkv_length>500000 & input_seqs$hallmark==0] <- keep_score[input_seqs$checkv_length>500000 & input_seqs$hallmark==0] - 1
}
return(keep_score)
}
note that this is only as accurate as the annotations of the input sequences
this function calculates the precision, recall, and F1 score for each pipeline
assess_performance <- function(seqtype, keep_score) {
truepositive <- rep("not viral", length(seqtype))
truepositive[seqtype=="virus"] <- "viral"
#make confusion matrix
confusion_matrix <- rep("true negative", length(keep_score))
confusion_matrix[truepositive=="viral" & keep_score<=1] <- "false negative"
confusion_matrix[truepositive=="viral" & keep_score>=1] <- "true positive"
confusion_matrix[truepositive=="not viral" & keep_score>=1] <- "false positive"
TP <- table(confusion_matrix)[4]
FP <- table(confusion_matrix)[2]
TN <- table(confusion_matrix)[3]
FN <- table(confusion_matrix)[1]
precision <- TP/(TP+FP)
recall <- TP/(TP+FN)
F1 <- 2*precision*recall/(precision+recall)
MCC <- (TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN))
auc <- round(auc(truepositive, keep_score),4)
#by type metrics
fungal_FP <- table(confusion_matrix[seqtype=="fungi"])[2]
protist_FP <- table(confusion_matrix[seqtype=="protist"])[2]
bacterial_FP <- table(confusion_matrix[seqtype=="bacteria"])[2]
viral_FN <- table(confusion_matrix[seqtype=="virus"])[1]
performance <- c(precision, recall, F1, MCC, auc, fungal_FP,
protist_FP, bacterial_FP, viral_FN)
names(performance) <- c("precision", "recall", "F1", "MCC", "AUC", "fungal_FP",
"protist_FP", "bacterial_FP", "viral_FN")
return(performance)
}
combination of tools list
combos_list <- data.frame(toolcombo=rep(0, 64),
CheckV=rep(0, 64),
DVF=rep(0, 64),
Kaiju=rep(0, 64),
VIBRANT=rep(0, 64),
VS=rep(0, 64),
VS2=rep(0, 64))
p <- 1
for (i in c(0,1)){
for (j in c(0,1)){
for (k in c(0,1)){
for (l in c(0,1)){
for (m in c(0,1)){
for (n in c(0,1)){
combos_list$toolcombo[p] <- paste(i,j,k,l,m,n)
combos_list$toolcombo2[p] <- paste(if(i){"C"}else{"0"},if(j){"DVF"}else{"0"},
if(k){"K"}else{"0"},if(l){"VB"}else{"0"},
if(m){"VS"}else{"0"},if(n){"VS2"}else{"0"})
combos_list$CheckV[p] <- i
combos_list$DVF[p] <- j
combos_list$Kaiju[p] <- k
combos_list$VIBRANT[p] <- l
combos_list$VS[p] <- m
combos_list$VS2[p] <- n
p <- p+1
}
}
}
}
}
}
combos_list <- combos_list[-1,]
this function builds a list of all of the combinations that the user wants to test. In this case, we’re comparing the performance of all unique combinations of the six tools.
build_score_list <- function(input_seqs, combos) {
Warning messages:
1: Unknown or uninitialised column: `confusion_matrix_high_precision`.
2: Unknown or uninitialised column: `confusion_matrix_high_precision`.
3: Unknown or uninitialised column: `confusion_matrix_high_precision`.
4: Unknown or uninitialised column: `confusion_matrix_high_recall`.
5: Unknown or uninitialised column: `confusion_matrix_high_recall`.
6: Unknown or uninitialised column: `confusion_matrix_high_recall`.
7: Unknown or uninitialised column: `keep_score_visualize`.
8: Unknown or uninitialised column: `truepositive`.
output <- data.frame(precision=rep(0, nrow(combos)),
recall=rep(0, nrow(combos)),
F1=rep(0, nrow(combos)),
MCC=rep(0, nrow(combos)),
AUC=rep(0, nrow(combos)),
fungal_FP=rep(0, nrow(combos)),
protist_FP=rep(0, nrow(combos)),
bacterial_FP=rep(0, nrow(combos)),
viral_FN=rep(0, nrow(combos)))
for (i in 1:nrow(combos)) {
keep_score <- getting_viral_set_1(input_seqs, include_vibrant = combos$VIBRANT[i],
include_virsorter = combos$VS[i],
include_virsorter2 = combos$VS2[i],
include_tuning = combos$CheckV[i],
include_kaiju = combos$Kaiju[i],
include_deepvirfinder = combos$DVF[i])
output[i,1:9] <- assess_performance(input_seqs$seqtype, keep_score)
output$toolcombo[i] <- paste(combos$CheckV[i],combos$DVF[i],
combos$Kaiju[i], combos$VIBRANT[i],
combos$VS[i], combos$VS2[i])
}
output[is.na(output)] <- 0
return (output)
}
accuracy_scores <- data.frame(testing_set_index=rep(0, nrow(combos_list)*10),
precision=rep(0, nrow(combos_list)*10),
recall=rep(0, nrow(combos_list)*10),
F1=rep(0, nrow(combos_list)*10),
MCC=rep(0, nrow(combos_list)*10),
AUC=rep(0, nrow(combos_list)*10),
fungal_FP=rep(0, nrow(combos_list)*10),
protist_FP=rep(0, nrow(combos_list)*10),
bacterial_FP=rep(0, nrow(combos_list)*10),
viral_FN=rep(0, nrow(combos_list)*10))
accuracy_scores <- cbind(testing_set_index=rep(1, nrow(combos_list)),
build_score_list(viruses[viruses$Index==1,], combos_list))
for (i in 2:10) {
accuracy_scores <- rbind(accuracy_scores,
cbind(testing_set_index=rep(i, nrow(combos_list)),
build_score_list(viruses[viruses$Index==i,], combos_list)))
}
library("stringr")
Warning messages:
1: Unknown or uninitialised column: `confusion_matrix_high_precision`.
2: Unknown or uninitialised column: `confusion_matrix_high_precision`.
3: Unknown or uninitialised column: `confusion_matrix_high_precision`.
4: Unknown or uninitialised column: `confusion_matrix_high_recall`.
5: Unknown or uninitialised column: `confusion_matrix_high_recall`.
6: Unknown or uninitialised column: `confusion_matrix_high_recall`.
7: Unknown or uninitialised column: `keep_score_visualize`.
8: Unknown or uninitialised column: `truepositive`.
accuracy_scores$numtools <- str_count(accuracy_scores$toolcombo, "1")
Warning messages:
1: Unknown or uninitialised column: `confusion_matrix_high_precision`.
2: Unknown or uninitialised column: `confusion_matrix_high_precision`.
3: Unknown or uninitialised column: `confusion_matrix_high_precision`.
4: Unknown or uninitialised column: `confusion_matrix_high_recall`.
5: Unknown or uninitialised column: `confusion_matrix_high_recall`.
6: Unknown or uninitialised column: `confusion_matrix_high_recall`.
7: Unknown or uninitialised column: `keep_score_visualize`.
8: Unknown or uninitialised column: `truepositive`.
#accuracy_scores <- accuracy_scores[order(accuracy_scores$numtools, decreasing=F),]
accuracy_scores <- accuracy_scores[order(accuracy_scores$MCC, decreasing=F),]
accuracy_scores$toolcombo <- factor(accuracy_scores$toolcombo, levels = unique(accuracy_scores$toolcombo))
accuracy_scores$numtools <- as.factor(accuracy_scores$numtools)
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
Warning messages:
1: Unknown or uninitialised column: `confusion_matrix_high_precision`.
2: Unknown or uninitialised column: `confusion_matrix_high_precision`.
3: Unknown or uninitialised column: `confusion_matrix_high_precision`.
4: Unknown or uninitialised column: `confusion_matrix_high_recall`.
5: Unknown or uninitialised column: `confusion_matrix_high_recall`.
6: Unknown or uninitialised column: `confusion_matrix_high_recall`.
7: Unknown or uninitialised column: `keep_score_visualize`.
8: Unknown or uninitialised column: `truepositive`.
p2 <- ggplot(accuracy_scores, aes(x=toolcombo, y=F1,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("F1 Score")
p2
ggplot(accuracy_scores, aes(x=toolcombo, y=precision,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Precision")
ggplot(accuracy_scores, aes(x=toolcombo, y=recall,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Recall")
ggplot(accuracy_scores, aes(x=precision, y=recall,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Precision") +
ylab("Recall")
ggplot(accuracy_scores, aes(x=toolcombo, y=abs(precision-recall),
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Precision-Recall")
ggplot(accuracy_scores, aes(x=toolcombo, y=MCC,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("MCC")
ggplot(accuracy_scores, aes(x=toolcombo, y=AUC,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("AUC")
ggplot(accuracy_scores, aes(x=toolcombo, y=fungal_FP,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Fungal False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=protist_FP,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Protist False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=bacterial_FP,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Bacterial False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=viral_FN,
color=numtools, fill=numtools)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Viral False Negatives")
write_tsv(accuracy_scores, "20220927_accuracy_scores.tsv")
to do: add in clustering and ordination like in the drinking water R notebook
viruses$keep_score_high_precision <- getting_viral_set_1(viruses, include_deepvirfinder = F,
Warning messages:
1: Unknown or uninitialised column: `confusion_matrix_high_precision`.
2: Unknown or uninitialised column: `confusion_matrix_high_precision`.
3: Unknown or uninitialised column: `confusion_matrix_high_precision`.
4: Unknown or uninitialised column: `confusion_matrix_high_recall`.
5: Unknown or uninitialised column: `confusion_matrix_high_recall`.
6: Unknown or uninitialised column: `confusion_matrix_high_recall`.
7: Unknown or uninitialised column: `keep_score_visualize`.
8: Unknown or uninitialised column: `truepositive`.
include_vibrant = T,
include_virsorter2 = F,
include_kaiju = T,
include_tuning = T,
include_virsorter = F)
viruses$confusion_matrix_high_precision <- "true negative"
Warning messages:
1: Unknown or uninitialised column: `confusion_matrix_high_precision`.
2: Unknown or uninitialised column: `confusion_matrix_high_precision`.
3: Unknown or uninitialised column: `confusion_matrix_high_precision`.
4: Unknown or uninitialised column: `confusion_matrix_high_recall`.
5: Unknown or uninitialised column: `confusion_matrix_high_recall`.
6: Unknown or uninitialised column: `confusion_matrix_high_recall`.
7: Unknown or uninitialised column: `keep_score_visualize`.
8: Unknown or uninitialised column: `truepositive`.
viruses$confusion_matrix_high_precision[viruses$seqtype=="virus" & viruses$keep_score_high_precision<1] <- "false negative"
viruses$confusion_matrix_high_precision[viruses$seqtype=="virus" & viruses$keep_score_high_precision>=1] <- "true positive"
viruses$confusion_matrix_high_precision[viruses$seqtype!="virus" & viruses$keep_score_high_precision>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_high_precision, viruses$seqtype, viruses$Index))
The melt generic in data.table has been passed a table and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(table(viruses$confusion_matrix_high_precision, viruses$seqtype, viruses$Index)). In the next version, this warning will become an error.
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
Warning messages:
1: Unknown or uninitialised column: `confusion_matrix_high_recall`.
2: Unknown or uninitialised column: `confusion_matrix_high_recall`.
3: Unknown or uninitialised column: `confusion_matrix_high_recall`.
4: Unknown or uninitialised column: `keep_score_visualize`.
5: Unknown or uninitialised column: `truepositive`.
ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=checkv_viral_genes, y=confusion_matrix_high_precision,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_boxplot(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Viral Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=percent_viral, y=confusion_matrix_high_precision,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_boxplot(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Percent Genes Viral") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=hallmark, y=confusion_matrix_high_precision,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_boxplot(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Hallmark Genes") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=hallmark, y=checkv_viral_genes,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Hallmark Genes") +
ylab("Number of Viral Genes") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
viruses_false_positive <- viruses[viruses$confusion_matrix_high_precision=="false positive",]
viruses_false_negative <- viruses[viruses$confusion_matrix_high_precision=="false negative",]
ggplot(viruses, aes(x=hallmark, y=checkv_viral_genes,
fill=checkv_length,
color=checkv_length,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Number of Viral Genes") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses_false_positive, aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses_false_positive[viruses_false_positive$seqtype=="bacteria"], aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_positive[viruses_false_positive$seqtype=="fungi"], aes(x=hallmark, y=checkv_length,
fill=keep_score_high_precision,
color=keep_score_high_precision,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_positive[viruses_false_positive$seqtype=="protist"], aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_negative, aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_negative, aes(x=hallmark, y=checkv_length,
fill=keep_score_high_precision,
color=keep_score_high_precision,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
table(viruses$hallmark[viruses$confusion_matrix_high_precision=="false positive"]>0)
FALSE TRUE
4398 2608
table(viruses$percent_host[viruses$confusion_matrix_high_precision=="false positive"]<50)
FALSE TRUE
855 6151
viruses$keep_score_high_MCC <- getting_viral_set_1(viruses, include_deepvirfinder = F,
There were 41 warnings (use warnings() to see them)
include_vibrant = T,
include_virsorter2 = F,
include_kaiju = T,
include_tuning = T,
include_virsorter = T)
viruses$confusion_matrix_high_MCC <- "true negative"
There were 22 warnings (use warnings() to see them)
viruses$confusion_matrix_high_MCC[viruses$seqtype=="virus" & viruses$keep_score_high_MCC<1] <- "false negative"
viruses$confusion_matrix_high_MCC[viruses$seqtype=="virus" & viruses$keep_score_high_MCC>=1] <- "true positive"
viruses$confusion_matrix_high_MCC[viruses$seqtype!="virus" & viruses$keep_score_high_MCC>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_high_MCC, viruses$seqtype, viruses$Index))
The melt generic in data.table has been passed a table and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(table(viruses$confusion_matrix_high_MCC, viruses$seqtype, viruses$Index)). In the next version, this warning will become an error.Warning messages:
1: Unknown or uninitialised column: `confusion_matrix_high_precision`.
2: Unknown or uninitialised column: `confusion_matrix_high_precision`.
3: Unknown or uninitialised column: `confusion_matrix_high_precision`.
4: Unknown or uninitialised column: `confusion_matrix_high_recall`.
5: Unknown or uninitialised column: `confusion_matrix_high_recall`.
6: Unknown or uninitialised column: `confusion_matrix_high_recall`.
7: Unknown or uninitialised column: `keep_score_visualize`.
8: Unknown or uninitialised column: `truepositive`.
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
viruses$keep_score_high_recall <- getting_viral_set_1(viruses, include_deepvirfinder = T,
There were 15 warnings (use warnings() to see them)
include_vibrant = T,
include_virsorter2 = T,
include_kaiju = T,
include_tuning = T,
include_virsorter = T)
viruses$confusion_matrix_high_recall <- "true negative"
viruses$confusion_matrix_high_recall[viruses$seqtype=="virus" & viruses$keep_score_high_recall<1] <- "false negative"
viruses$confusion_matrix_high_recall[viruses$seqtype=="virus" & viruses$keep_score_high_recall>=1] <- "true positive"
viruses$confusion_matrix_high_recall[viruses$seqtype!="virus" & viruses$keep_score_high_recall>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_high_recall, viruses$seqtype, viruses$Index))
The melt generic in data.table has been passed a table and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(table(viruses$confusion_matrix_high_recall, viruses$seqtype, viruses$Index)). In the next version, this warning will become an error.
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
Warning messages:
1: Unknown or uninitialised column: `keep_score_visualize`.
2: Unknown or uninitialised column: `truepositive`.
p2 <- ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
p2
looking at false negatives
viruses_false_negs <- viruses[(viruses$seqtype=="virus" & viruses$keep_score_high_recall<=1),]
viruses$keep_score_visualize <- viruses$keep_score
viruses$keep_score_visualize[viruses$keep_score>1] <- ">1"
viruses$keep_score_visualize <- factor(viruses$keep_score_visualize,
levels=c("-0.5", "-1", "0", "0.5","1", ">1"))
viruses$keep_score_visualize <- factor(viruses$keep_score_visualize,
labels=c("≤ 0", "≤ 0", "≤ 0", "0.5","1", "> 1"))
levels(factor(viruses$keep_score_visualize))
pal <- ggthemes::tableau_color_pal(palette="Tableau 20", type="regular")
p1 <- ggplot(viruses, aes(x=as.factor(Index),
fill=keep_score_visualize, color=keep_score_visualize)) +
geom_bar(stat="count", position="stack") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16)
) +
scale_color_manual(name = 'Viral Score',
values = alpha(c(pal(4)), 1)) +
scale_fill_manual(name = 'Viral Score',
values = alpha(c(pal(4)), 0.5)) +
xlab("Index") +
ylab("Sequence Count") +
facet_wrap(~confusion_matrix, scales = "free")
p1
library(pROC)
viruses$truepositive <- rep(0, nrow(viruses))
viruses$truepositive[viruses$seqtype=="virus"] <- 1
rocobj <- roc(viruses$truepositive, viruses$keep_score)
rocobj_all <- roc(viruses$truepositive, viruses$keep_score_all)
auc <- round(auc(viruses$truepositive, viruses$keep_score),4)
auc_all <- round(auc(viruses$truepositive, viruses$keep_score_all),4)
#create ROC plot
ggroc(rocobj, colour = 'steelblue', size = 2) +
ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')')) +
coord_equal()
ggroc(rocobj_all, colour = 'green', size = 2) +
ggtitle(paste0('ROC Curve ', '(AUC = ', auc_all, ')'))
Sensitivity: The probability that the model predicts a positive outcome for an observation when indeed the outcome is positive. Specificity: The probability that the model predicts a negative outcome for an observation when indeed the outcome is negative.
viral_scores <- matrix(data=0, nrow=nrow(viruses), ncol=nrow(combos_list))
num_viruses <- data.frame(toolcombo=rep(0, nrow(combos_list)),
num_viruses=rep(0, nrow(combos_list)))
for (i in 1:nrow(combos_list)) {
viral_scores[,i] <- getting_viral_set_1(viruses, include_vibrant = combos_list$VIBRANT[i],
include_virsorter = combos_list$VS[i],
include_virsorter2 = combos_list$VS2[i],
include_tuning = combos_list$CheckV[i],
include_kaiju = combos_list$Kaiju[i],
include_deepvirfinder = combos_list$DVF[i])
num_viruses$num_viruses[i] <- table(viral_scores[,i]>=1)[[2]]
num_viruses$toolcombo[i] <- combos_list$toolcombo[i]
num_viruses$toolcombo2[i] <- combos_list$toolcombo2[i]
}
num_viruses$numtools <- str_count(num_viruses$toolcombo, "1")
num_viruses <- num_viruses[order(num_viruses$num_viruses, decreasing=F),]
num_viruses$toolcombo <- factor(num_viruses$toolcombo, levels = unique(num_viruses$toolcombo))
num_viruses$toolcombo2 <- factor(num_viruses$toolcombo2, levels = unique(num_viruses$toolcombo2))
num_viruses$numtools <- as.factor(num_viruses$numtools)
ggplot(num_viruses, aes(x=toolcombo, y=num_viruses,
color=numtools, fill=numtools)) +
geom_point() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Num Viruses Predicted")
ggplot(num_viruses, aes(x=toolcombo2, y=num_viruses,
color=numtools, fill=numtools)) +
geom_point() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Num Viruses Predicted")
ggplot(num_viruses, aes(x=numtools, y=num_viruses)) +
geom_boxplot(aes(color=numtools)) +
geom_point(aes(color=numtools, fill=numtools)) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Tools") +
ylab("Num Viruses Predicted")
viral_scores_nozeros <- viral_scores[rowSums(viral_scores)>0,]
viral_scores_nozeros <- viral_scores_nozeros + 1
viral_scores_nozeros <- as.data.frame(viral_scores_nozeros)
colnames(viral_scores_nozeros) <- num_viruses$toolcombo2
tooldata <- num_viruses
rownames(tooldata) <- tooldata$toolcombo2
tooldata <- num_viruses
rownames(tooldata) <- tooldata$toolcombo2
physeq_pooled <- phyloseq(otu_table(viral_scores_nozeros, taxa_are_rows = T),
sample_data(tooldata))
ordination <- phyloseq::ordinate(physeq =physeq_pooled, method = "PCoA", distance = "bray")
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numtools", color="num_viruses") +
geom_point(size = 3) +
theme_bw() +
geom_label(label=tooldata$toolcombo)
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numtools", color="num_viruses") +
geom_point(size = 3) +
theme_bw()
to do: try coloring above based on the F1 scores of the testing set on each combination
bray_dist <- phyloseq::distance(physeq_pooled, method="bray")
clusters <- hclust(dist(bray_dist))
plot(clusters)
myclusters <- cutree(clusters, h=1.1)
names(myclusters[myclusters==1])
names(myclusters[myclusters==2])
names(myclusters[myclusters==3])
names(myclusters[myclusters==4])
names(myclusters[myclusters==5])
myclusters_df <- tibble(combo=names(myclusters),
cluster_index=myclusters)
myclusters_df <- separate(myclusters_df, col=combo, into=c("CheckV", "DVF",
"Kaiju", "VIBRANT",
"VirSorter", "VirSorter2"),
sep=" ", remove = F)
tool_count <- as.data.frame(rbind(table(myclusters_df$CheckV, myclusters_df$cluster_index)[2,],
table(myclusters_df$DVF, myclusters_df$cluster_index)[2,],
table(myclusters_df$Kaiju, myclusters_df$cluster_index)[2,],
table(myclusters_df$VIBRANT, myclusters_df$cluster_index)[2,],
table(myclusters_df$VirSorter, myclusters_df$cluster_index)[2,],
table(myclusters_df$VirSorter2, myclusters_df$cluster_index)[2,])
)
tool_count$method <- c("CheckV", "DVF", "Kaiju", "VIBRANT", "VirSorter", "VirSorter2")
tool_count <- melt(tool_count)
colnames(tool_count) <- c("tool", "cluster_index", "tool_count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(tool_count, aes(x=cluster_index, y=tool_count,
fill=cluster_index,
color=cluster_index)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
xlab("Cluster") +
ylab("Number of Times in Cluster") +
facet_wrap(~tool, scales = "free")